R Markdown

Source of the data: https://data.world/vizzup/mental-health-depression-disorder-data

## importing libraries

library(readxl)
library(plotly)
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggmap)
library(maps)
library(corrplot)
library(lmtest)
library(car)
## Importing the mental health data set

# Read the mental-health workbook into a tibble and preview its first rows.
# NOTE(review): the path is absolute and user-specific, so the document only
# knits on the original author's machine; consider a project-relative path.
mental_health <- readxl::read_excel(
  path = "C:/Users/Marti/OneDrive/Desktop/R_Project_2023/mental_health_data.xlsx"
)
head(mental_health, n = 6)
## # A tibble: 6 x 10
##   Entity      Code   Year `Schizophrenia (%)` `Bipolar disorder (%)`
##   <chr>       <chr> <dbl>               <dbl>                  <dbl>
## 1 Afghanistan AFG    1990               0.161                  0.698
## 2 Afghanistan AFG    1991               0.160                  0.698
## 3 Afghanistan AFG    1992               0.160                  0.698
## 4 Afghanistan AFG    1993               0.160                  0.698
## 5 Afghanistan AFG    1994               0.160                  0.698
## 6 Afghanistan AFG    1995               0.160                  0.699
## # i 5 more variables: `Eating disorders (%)` <dbl>,
## #   `Anxiety disorders (%)` <dbl>, `Drug use disorders (%)` <dbl>,
## #   `Depression (%)` <dbl>, `Alcohol use disorders (%)` <dbl>
# Compact overview of every column: type, length and first values.
utils::str(mental_health)
## tibble [6,468 x 10] (S3: tbl_df/tbl/data.frame)
##  $ Entity                   : chr [1:6468] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Code                     : chr [1:6468] "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                     : num [1:6468] 1990 1991 1992 1993 1994 ...
##  $ Schizophrenia (%)        : num [1:6468] 0.161 0.16 0.16 0.16 0.16 ...
##  $ Bipolar disorder (%)     : num [1:6468] 0.698 0.698 0.698 0.698 0.698 ...
##  $ Eating disorders (%)     : num [1:6468] 0.1019 0.0993 0.0967 0.0943 0.0924 ...
##  $ Anxiety disorders (%)    : num [1:6468] 4.83 4.83 4.83 4.83 4.83 ...
##  $ Drug use disorders (%)   : num [1:6468] 1.68 1.68 1.69 1.71 1.72 ...
##  $ Depression (%)           : num [1:6468] 4.07 4.08 4.09 4.1 4.1 ...
##  $ Alcohol use disorders (%): num [1:6468] 0.672 0.672 0.671 0.67 0.669 ...
#summary(mental_health)

There are 10 columns in the data set.

# List the column names of the raw data set.
names(mental_health)
##  [1] "Entity"                    "Code"                     
##  [3] "Year"                      "Schizophrenia (%)"        
##  [5] "Bipolar disorder (%)"      "Eating disorders (%)"     
##  [7] "Anxiety disorders (%)"     "Drug use disorders (%)"   
##  [9] "Depression (%)"            "Alcohol use disorders (%)"
# Prevalence distribution for each mental disorder

# Histogram of prevalence for each disorder, one facet per disorder.
# Only the numeric disorder columns are reshaped: `Code` is a character
# identifier, and stacking it with the percentages would coerce every value
# to character and turn all `Code` rows into NA on conversion back to numeric.
# `bins` is used instead of a fixed bin width because the disorders live on
# very different scales (some well below 1 %), so a single width of 0.5
# collapses the low-prevalence disorders into one or two bars.
mental_health %>%
  pivot_longer(
    cols = -c(Entity, Code, Year),
    names_to = "Disorder",
    values_to = "Prevalence"
  ) %>%
  ggplot(aes(x = Prevalence)) +
  geom_histogram(bins = 30) +
  facet_wrap(~Disorder, scales = "free_x") +
  xlab("Prevalence (%)") +
  ylab("Count") +
  theme_minimal()
## Warning: There was 1 warning in `mutate()`.
## i In argument: `Prevalence = as.numeric(Prevalence)`.
## Caused by warning:
## ! pojawiły się wartości NA na skutek przekształcenia
## Warning: Removed 6468 rows containing non-finite values (`stat_bin()`).

Data Cleaning

# Percentage of rows that contain at least one missing value.
100 * mean(!complete.cases(mental_health))
## [1] 15.15152
# Stacked bar chart of missing vs. present values for every column except
# the row identifiers `Entity` and `Year`.
# Each column is converted to a logical "is missing" flag *before* reshaping,
# so the character `Code` column and the numeric disorder columns can be
# stacked without coercing the percentages to character.
# The axis is called "Variable" because `Code` is shown alongside the
# disorders; it is kept on purpose so that missing country codes stay visible.
mental_health %>%
  select(-Entity, -Year) %>%
  mutate(across(everything(), is.na)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Missing"
  ) %>%
  # count() returns an ungrouped result, avoiding the implicit-grouping message
  count(Variable, Missing, name = "Count") %>%
  ggplot(aes(x = Variable, y = Count, fill = Missing)) +
  geom_col() +
  ylab("Count") +
  theme_minimal()
## `summarise()` has grouped output by 'Disorder'. You can override using the
## `.groups` argument.

Approximately 15% of the rows contain missing data; these rows are removed below.

# Keep only the rows in which every column is populated.
# NOTE(review): if the missing values are confined to `Code` (entities without
# a country code), this removes those entities from all later analysis --
# confirm that this is intended.
mental_health_clean <- dplyr::filter(
  mental_health,
  complete.cases(mental_health)
)
# Tables with descriptive statistics for each disorder

# Mean, standard deviation and number of missing values for each disorder.
# `across()` with a named list replaces the deprecated `funs()` helper.
# `Code` is excluded together with `Entity` and `Year`: it is a character
# column, so its mean/sd are NA and only trigger warnings.
# Result columns are named `<column>_mean`, `<column>_sd` and
# `<column>_n_missing` (the last one was previously suffixed `_sum`).
mental_health %>%
  summarise(
    across(
      -c(Entity, Code, Year),
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE),
        n_missing = ~ sum(is.na(.x))
      )
    )
  )
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## i Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## i In argument: `Code_mean = mean(Code, na.rm = TRUE)`.
## Caused by warning in `mean.default()`:
## ! argument nie jest wartością liczbową ani logiczną: zwracanie wartości NA
## i Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1 remaining warning.
## # A tibble: 1 x 24
##   Code_mean Schizophrenia (%)_me~1 Bipolar disorder (%)~2 Eating disorders (%)~3
##       <dbl>                  <dbl>                  <dbl>                  <dbl>
## 1        NA                  0.212                  0.719                  0.240
## # i abbreviated names: 1: `Schizophrenia (%)_mean`,
## #   2: `Bipolar disorder (%)_mean`, 3: `Eating disorders (%)_mean`
## # i 20 more variables: `Anxiety disorders (%)_mean` <dbl>,
## #   `Drug use disorders (%)_mean` <dbl>, `Depression (%)_mean` <dbl>,
## #   `Alcohol use disorders (%)_mean` <dbl>, Code_sd <dbl>,
## #   `Schizophrenia (%)_sd` <dbl>, `Bipolar disorder (%)_sd` <dbl>,
## #   `Eating disorders (%)_sd` <dbl>, `Anxiety disorders (%)_sd` <dbl>, ...

Correlation Analysis

# Pairwise correlations between the disorder prevalences.
# `numeric_data` still holds every numeric column, but `Year` is dropped
# before correlating: it is a time index, not a disorder, and would otherwise
# appear as an extra row/column in the plots of this matrix.
# The matrix is computed once; an earlier duplicate computation whose result
# was immediately overwritten has been removed.

# Select only numeric columns
numeric_data <- mental_health %>%
  select(where(is.numeric))

# Compute correlation matrix (disorder columns only)
correlation_matrix <- numeric_data %>%
  select(-Year) %>%
  cor(use = "pairwise.complete.obs")

# Plot correlation matrix
corrplot(correlation_matrix, method = "circle")

# Interactive heatmap of `correlation_matrix`; both axes carry the matrix
# column names and the axis titles are blanked.
# NOTE(review): "RdYlBu" is a ColorBrewer palette name -- confirm plotly
# honours it as a `colorscale`; otherwise it may need to go through `colors`.
plot <- plot_ly(
  z = correlation_matrix,
  x = colnames(correlation_matrix),
  y = colnames(correlation_matrix),
  type = "heatmap",
  colorscale = "RdYlBu"
) %>%
  layout(
    title = "Correlation of mental disorders",
    xaxis = list(title = ""),
    yaxis = list(title = "")
  )

plot
# Regression model to investigate how the prevalence of one disorder is associated with another

# Simple linear regression of depression prevalence on anxiety-disorder
# prevalence, fitted on the complete-case data; the summary prints the
# coefficients and fit statistics.
model <- lm(
  formula = `Depression (%)` ~ `Anxiety disorders (%)`,
  data = mental_health_clean
)
summary(model)
## 
## Call:
## lm(formula = `Depression (%)` ~ `Anxiety disorders (%)`, data = mental_health_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.27593 -0.42309 -0.05742  0.36680  2.78405 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.675964   0.030845   86.75   <2e-16 ***
## `Anxiety disorders (%)` 0.202317   0.007511   26.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6313 on 5486 degrees of freedom
## Multiple R-squared:  0.1168, Adjusted R-squared:  0.1167 
## F-statistic: 725.6 on 1 and 5486 DF,  p-value: < 2.2e-16
# Relationship between Depression and Anxiety Disorders

# Scatter plot of depression against anxiety-disorder prevalence with a
# fitted least-squares line.
ggplot(
  data = mental_health_clean,
  mapping = aes(x = `Anxiety disorders (%)`, y = `Depression (%)`)
) +
  geom_point(colour = "deepskyblue") +
  geom_smooth(method = "lm", colour = "darkviolet") +
  labs(
    title = "Relationship between Depression and Anxiety Disorders",
    x = "Anxiety Disorders (%)",
    y = "Depression (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Relationship between Bipolar and Eating disorders

# Scatter plot of eating-disorder against bipolar-disorder prevalence with a
# fitted least-squares line.
ggplot(
  data = mental_health_clean,
  mapping = aes(x = `Bipolar disorder (%)`, y = `Eating disorders (%)`)
) +
  geom_point(colour = "deepskyblue") +
  geom_smooth(method = "lm", colour = "darkviolet") +
  labs(
    title = "Relationship between Bipolar and Eating disorders",
    x = "Bipolar disorder (%)",
    y = "Eating disorders (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Development of depression over the years

# Yearly mean and median of depression prevalence across all rows of the
# cleaned data (an unweighted summary over entities, not a population-weighted
# figure).
# Because FUN returns a named vector, aggregate() stores both statistics in a
# single matrix column `x` with columns "mean" and "median".
depression_by_year <- aggregate(
  mental_health_clean$`Depression (%)`,
  by = list(Year = mental_health_clean$Year),
  FUN = function(x) c(mean = mean(x), median = median(x))
)

# Line chart with one trace per statistic (title typo "depresion" corrected).
plot <- plot_ly(
  x = depression_by_year$Year,
  y = depression_by_year$x[, "mean"],
  type = "scatter",
  mode = "lines",
  name = "Mean"
) %>%
  add_trace(y = depression_by_year$x[, "median"], name = "Median") %>%
  layout(
    title = "Development of depression over the years",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Depression level in %")
  )

plot

Alcohol Use Disorder by Country in 2017

# Restrict the cleaned data to the year 2017.
alcohol_disorder_2017 <- mental_health_clean %>%
  filter(Year == 2017)
# Alcohol Use Disorder by Country in 2017

# Choropleth keyed on the `Code` column, coloured by alcohol-use-disorder
# prevalence, with the entity name as hover text.
# NOTE(review): "RdPu" is a ColorBrewer palette name -- confirm plotly honours
# it as a `colorscale`; otherwise it may need to be passed via `colors`.
plot_alcohol <- plot_ly(
  data = alcohol_disorder_2017,
  type = "choropleth",
  locations = alcohol_disorder_2017$Code,
  z = alcohol_disorder_2017$`Alcohol use disorders (%)`,
  text = alcohol_disorder_2017$Entity,
  colorscale = "RdPu"
) %>%
  layout(title = "Alcohol Use Disorder by Country in 2017")
plot_alcohol

Depression by Country in 2017

# Restrict the cleaned data to the year 2017.
depression_2017 <- mental_health_clean %>%
  filter(Year == 2017)
# Depression by Country in 2017

# Choropleth keyed on the `Code` column, coloured by depression prevalence,
# with the entity name as hover text.
# NOTE(review): "RdPu" is a ColorBrewer palette name -- confirm plotly honours
# it as a `colorscale`; otherwise it may need to be passed via `colors`.
plot_depression <- plot_ly(
  data = depression_2017,
  type = "choropleth",
  locations = depression_2017$Code,
  z = depression_2017$`Depression (%)`,
  text = depression_2017$Entity,
  colorscale = "RdPu"
) %>%
  layout(title = "Depression by Country in 2017")
plot_depression